United States Car Accident Project
1 Indroduction
1.1 Motivation for the Project - James
1.2 Research Questions -James
1.3 Overview of Modeling Techniques - Me
Binary Response Model
Linear Regression
Lasso Regression
Ridge Regression
Ordinal Logit Regression
Decision Tree
Random Forest
Artificial Neural Network
2 Raw Data
3 Preparing Data For Machine Learning
accident_12var <- accident_raw %>%
select(Severity,State, `Temperature(F)`, `Humidity(%)`,
`Visibility(mi)`, `Wind_Speed(mph)`, Weather_Condition,
`Precipitation(in)`, Crossing, Junction, Traffic_Signal,
Sunrise_Sunset)
colnames(accident_12var) <- gsub("\\)|\\%|\\(", ".", colnames(accident_12var))library(caret)
library(recipes)
library(dplyr)
# Split the data into training and testing sets
set.seed(2)
train_indices <- createDataPartition(accident_12var$Severity, p = 0.8, list = FALSE)
train_set <- accident_12var[train_indices, ]
test_set <- accident_12var[-train_indices, ]
# TRAIN SET
# Make a copy of the train set
copied_traindata <- data.frame(train_set)
# Add an id column to copied_traindata
copied_traindata <- copied_traindata %>% mutate(id = row_number())
# Separate Label from Feature
accident <- select(copied_traindata, -Severity) # drop Severity column
label <- copied_traindata$Severity # select Severity column
# Separate Numerical from Categorical
accident_num <- accident %>%
select(id, Temperature.F., Humidity..., Visibility.mi., Wind_Speed.mph., Precipitation.in.)
accident_cat <- accident %>%
select(id, State, Weather_Condition, Crossing, Junction, Traffic_Signal, Sunrise_Sunset)
# Define numeric and categorical attributes
num_attribs <- names(accident_num)[2:6]
cat_attribs <- names(accident_cat)[2:7]
# Define preprocessing pipelines
num_pipeline <- recipe(~., data = accident_num) %>%
step_impute_median(all_numeric(), -has_role("id")) %>%
step_center(all_numeric(), -has_role("id")) %>%
step_scale(all_numeric(), -has_role("id"))
cat_pipeline <- recipe(~., data = accident_cat) %>%
step_dummy(all_nominal())
# Merge the preprocessed numerical and categorical features into a single dataset
accident <- accident %>% rename(Index = id)
df1 <- mutate(num_pipeline %>% prep() %>% bake(new_data = NULL), join_key = "Index")
df2 <- mutate(cat_pipeline %>% prep() %>% bake(new_data = NULL), join_key = "Index")
accident_prepared <- accident %>%
select(-one_of(c(cat_attribs, num_attribs)))
accident_prepared <- cbind(accident_prepared, df1,df2)
accident_prepared <- accident_prepared %>%
distinct()
accident_prepared <- select(accident_prepared, -c("Index", "id", "join_key", "id.1", "join_key.1"))
#TEST SET
# Make a copy of the test set
copied_testdata <- data.frame(test_set)
# Add an id column to copied_testdata
copied_testdata <- copied_testdata %>% mutate(id = row_number())
# Separate Label from Feature
accident_test <- select(copied_testdata, -Severity) # drop Severity column
label_test <- copied_testdata$Severity # select Severity column
# Separate Numerical from Categorical
accident_num_test <- copied_testdata %>%
select(Temperature.F., Humidity..., Visibility.mi., Wind_Speed.mph., Precipitation.in.)
accident_cat_test <- copied_testdata %>%
select(State, Weather_Condition, Crossing, Junction, Traffic_Signal, Sunrise_Sunset)
# Define numeric and categorical attributes
num_attribs <- names(accident_num_test)[1:6]
cat_attribs <- names(accident_cat_test)[1:7]
# Define preprocessing pipelines
num_pipeline <- recipe(~., data = accident_num_test) %>%
step_impute_median(all_numeric(), -has_role("id")) %>%
step_center(all_numeric(), -has_role("id")) %>%
step_scale(all_numeric(), -has_role("id"))
cat_pipeline <- recipe(~., data = accident_cat_test) %>%
step_dummy(all_nominal())
# Merge the preprocessed numerical and categorical features into a single dataset
copied_testdata <- copied_testdata %>% rename(Index = id)
df1 <- mutate(num_pipeline %>% prep() %>% bake(new_data = NULL), join_key = "Index")
df2 <- mutate(cat_pipeline %>% prep() %>% bake(new_data = NULL), join_key = "Index")
accident_prepared_test <- accident_test %>%
select(-one_of(c(cat_attribs, num_attribs)))
accident_prepared_test <- cbind(accident_prepared_test, df1,df2)
accident_prepared_test <- accident_prepared_test %>%
distinct()
accident_prepared_test <- select(accident_prepared_test, -c("id", "join_key", "join_key.1"))4 Models
4.1 Linear Regression
# Fit the linear regression model
lin_reg <- lm(label ~ ., data = accident_prepared)
# Use the model to predict the response variable using the test data
y_pred <- predict(lin_reg, newdata = accident_prepared_test)
# Calculate the residuals
residuals <- y_pred - label_test
# Calculate the squared errors
squared_errors <- residuals^2
# Calculate the mean squared error
mse <- mean(squared_errors)
# Print the MSE
cat("MSE:", mse)
## MSE: 0.13249144.2 Ridge Regression
#ridge regression
library(glmnet)
# Separate the predictor variables from the response variable
y <- label
X <- as.matrix(select(accident_prepared, -label))
# Define the lambda sequence for ridge regression
lambda_seq <- 10^seq(10, -2, length = 100)
# Perform cross-validated ridge regression
ridge_fit <- cv.glmnet(X, y, alpha = 0, lambda = lambda_seq)
# Plot the cross-validation results
plot(ridge_fit)
ridge_coef <- coef(ridge_fit)[-1]
y_pred <- predict(ridge_fit, newx = X)
mse <- mean((y - y_pred)^2)
# Print the MSE
cat("MSE:", mse)
## MSE: 0.13411024.3 Lasso Regression
x <- model.matrix(~ ., data = accident_prepared)
y <- label
# Fit a Lasso regression with cross-validation
lasso_model <- cv.glmnet(x, y, alpha = 1)
extra_columns <- setdiff(colnames(accident_prepared_test), colnames(accident_prepared))
accident_prepared_test <- accident_prepared_test %>%
select(-one_of(extra_columns))
# Predict the response variable using the test data
x_test <- model.matrix(~ ., data = accident_prepared_test)
y_pred <- predict(lasso_model, newx = x_test)
# Calculate the MSE
mse <- mean((y_pred - label_test)^2)
# Print the MSE
cat("MSE:", mse)
## MSE: 0.134835
plot(lasso_model)
5 Results
5.1 Linear Regression Results
library(stargazer)
stargazer(lin_reg, type = "text", title="Regression Results", ci=TRUE, ci.level=0.90, single.row=TRUE,dep.var.caption = "Severity of Car Accident")
##
## Regression Results
## ==========================================================================
## Severity of Car Accident
## ---------------------------
## label
## --------------------------------------------------------------------------
## Temperature.F. 0.002 (-0.001, 0.005)
## Humidity... 0.007*** (0.004, 0.010)
## Visibility.mi. -0.002 (-0.005, 0.002)
## Wind_Speed.mph. -0.001 (-0.004, 0.002)
## Precipitation.in. -0.0002 (-0.003, 0.002)
## Crossing -0.036*** (-0.043, -0.029)
## Junction 0.019 (-0.014, 0.052)
## Traffic_Signal -0.014*** (-0.021, -0.007)
## State_AR 0.150*** (0.101, 0.199)
## State_AZ -0.199*** (-0.233, -0.165)
## State_CA -0.104*** (-0.135, -0.074)
## State_CO 0.554*** (0.506, 0.602)
## State_CT 0.503*** (0.451, 0.555)
## State_DC 0.149*** (0.103, 0.196)
## State_DE 0.375*** (0.308, 0.441)
## State_FL -0.097*** (-0.127, -0.066)
## State_GA 0.601*** (0.552, 0.650)
## State_IA 0.400*** (0.336, 0.463)
## State_ID 0.006 (-0.042, 0.053)
## State_IL 0.290*** (0.254, 0.327)
## State_IN 0.594*** (0.540, 0.648)
## State_KS 0.112 (-0.007, 0.231)
## State_KY 0.024 (-0.044, 0.092)
## State_LA -0.119*** (-0.152, -0.086)
## State_MA 0.538*** (0.447, 0.630)
## State_MD 0.193*** (0.157, 0.229)
## State_ME -0.162* (-0.316, -0.009)
## State_MI 0.084*** (0.048, 0.120)
## State_MN -0.105*** (-0.138, -0.072)
## State_MO 0.006 (-0.039, 0.050)
## State_MS 0.135*** (0.057, 0.214)
## State_MT -0.094*** (-0.133, -0.056)
## State_NC -0.018 (-0.049, 0.014)
## State_ND -0.119* (-0.220, -0.019)
## State_NE 0.311*** (0.215, 0.407)
## State_NH 0.617*** (0.477, 0.758)
## State_NJ 0.271*** (0.233, 0.310)
## State_NM -0.101 (-0.245, 0.044)
## State_NV 0.196*** (0.112, 0.280)
## State_NY 0.157*** (0.124, 0.190)
## State_OH 0.030 (-0.010, 0.069)
## State_OK -0.008 (-0.050, 0.035)
## State_OR -0.053*** (-0.084, -0.021)
## State_PA 0.012 (-0.020, 0.043)
## State_RI 0.109 (-0.092, 0.311)
## State_SC -0.098*** (-0.130, -0.067)
## State_SD 0.078 (-0.105, 0.261)
## State_TN -0.087*** (-0.120, -0.053)
## State_TX -0.093*** (-0.125, -0.062)
## State_UT -0.057*** (-0.094, -0.021)
## State_VA 0.071*** (0.038, 0.103)
## State_VT -0.236** (-0.428, -0.043)
## State_WA 0.242*** (0.202, 0.281)
## State_WI 1.430*** (1.339, 1.520)
## State_WV 0.066 (-0.005, 0.136)
## Weather_Condition_Blowing.Dust...Windy 0.023 (-0.457, 0.503)
## Weather_Condition_Blowing.Snow...Windy -0.175 (-0.815, 0.466)
## Weather_Condition_Clear 0.273 (-0.062, 0.607)
## Weather_Condition_Cloudy 0.015 (-0.212, 0.241)
## Weather_Condition_Cloudy...Windy 0.028 (-0.201, 0.258)
## Weather_Condition_Drizzle 0.136 (-0.110, 0.381)
## Weather_Condition_Drizzle.and.Fog -0.068 (-0.444, 0.307)
## Weather_Condition_Fair 0.020 (-0.207, 0.246)
## Weather_Condition_Fair...Windy 0.052 (-0.176, 0.280)
## Weather_Condition_Fog 0.007 (-0.220, 0.234)
## Weather_Condition_Fog...Windy -0.007 (-0.340, 0.327)
## Weather_Condition_Freezing.Drizzle -0.183 (-0.823, 0.457)
## Weather_Condition_Haze 0.010 (-0.217, 0.238)
## Weather_Condition_Haze...Windy 0.057 (-0.202, 0.315)
## Weather_Condition_Heavy.Drizzle -0.057 (-0.470, 0.356)
## Weather_Condition_Heavy.Rain 0.008 (-0.223, 0.238)
## Weather_Condition_Heavy.Rain...Windy 0.196 (-0.099, 0.492)
## Weather_Condition_Heavy.Sleet 0.030 (-0.610, 0.670)
## Weather_Condition_Heavy.Snow 0.038 (-0.213, 0.290)
## Weather_Condition_Heavy.Snow...Windy 0.017 (-0.463, 0.498)
## Weather_Condition_Heavy.T.Storm -0.008 (-0.241, 0.226)
## Weather_Condition_Heavy.T.Storm...Windy -0.165 (-0.467, 0.138)
## Weather_Condition_Light.Drizzle 0.040 (-0.192, 0.271)
## Weather_Condition_Light.Drizzle...Windy 0.659*** (0.308, 1.010)
## Weather_Condition_Light.Freezing.Drizzle -0.054 (-0.404, 0.297)
## Weather_Condition_Light.Freezing.Rain 0.252 (-0.050, 0.554)
## Weather_Condition_Light.Freezing.Rain...Windy 0.871*** (0.496, 1.247)
## Weather_Condition_Light.Ice.Pellets 0.814*** (0.334, 1.294)
## Weather_Condition_Light.Rain 0.029 (-0.198, 0.255)
## Weather_Condition_Light.Rain...Windy 0.057 (-0.177, 0.292)
## Weather_Condition_Light.Rain.Shower 0.143 (-0.233, 0.519)
## Weather_Condition_Light.Rain.with.Thunder -0.007 (-0.238, 0.223)
## Weather_Condition_Light.Snow 0.067 (-0.161, 0.294)
## Weather_Condition_Light.Snow...Windy 0.048 (-0.194, 0.291)
## Weather_Condition_Light.Snow.and.Sleet -0.011 (-0.651, 0.629)
## Weather_Condition_Light.Snow.Shower -0.023 (-0.663, 0.618)
## Weather_Condition_Light.Thunderstorms.and.Rain 0.613 (-0.028, 1.253)
## Weather_Condition_Mist 0.123 (-0.162, 0.408)
## Weather_Condition_Mostly.Cloudy 0.006 (-0.221, 0.232)
## Weather_Condition_Mostly.Cloudy...Windy 0.022 (-0.208, 0.252)
## Weather_Condition_N.A.Precipitation -0.091 (-0.345, 0.163)
## Weather_Condition_Overcast 0.498*** (0.263, 0.734)
## Weather_Condition_Partly.Cloudy 0.016 (-0.211, 0.242)
## Weather_Condition_Partly.Cloudy...Windy -0.004 (-0.238, 0.230)
## Weather_Condition_Patches.of.Fog -0.003 (-0.255, 0.250)
## Weather_Condition_Rain 0.014 (-0.214, 0.242)
## Weather_Condition_Rain...Windy 0.036 (-0.231, 0.303)
## Weather_Condition_Scattered.Clouds -0.157 (-0.570, 0.256)
## Weather_Condition_Shallow.Fog -0.055 (-0.304, 0.194)
## Weather_Condition_Showers.in.the.Vicinity -0.080 (-0.493, 0.334)
## Weather_Condition_Sleet -0.215 (-0.695, 0.265)
## Weather_Condition_Smoke 0.131 (-0.099, 0.360)
## Weather_Condition_Smoke...Windy 1.000*** (0.520, 1.480)
## Weather_Condition_Snow 0.081 (-0.154, 0.316)
## Weather_Condition_Snow...Windy -0.002 (-0.322, 0.318)
## Weather_Condition_Snow.and.Sleet -0.119 (-0.759, 0.521)
## Weather_Condition_T.Storm 0.025 (-0.205, 0.255)
## Weather_Condition_T.Storm...Windy -0.016 (-0.429, 0.397)
## Weather_Condition_Thunder 0.011 (-0.219, 0.241)
## Weather_Condition_Thunder...Windy -0.025 (-0.315, 0.264)
## Weather_Condition_Thunder...Wintry.Mix -0.253 (-0.894, 0.388)
## Weather_Condition_Thunder.and.Hail 0.004 (-0.636, 0.643)
## Weather_Condition_Thunder.in.the.Vicinity 0.033 (-0.195, 0.262)
## Weather_Condition_Widespread.Dust 0.005 (-0.635, 0.644)
## Weather_Condition_Widespread.Dust...Windy 0.021 (-0.619, 0.661)
## Weather_Condition_Wintry.Mix 0.040 (-0.194, 0.274)
## Weather_Condition_Wintry.Mix...Windy -0.008 (-0.648, 0.631)
## Sunrise_Sunset_Night 0.012*** (0.006, 0.017)
## Constant 2.096*** (1.867, 2.324)
## --------------------------------------------------------------------------
## Observations 75,921
## R2 0.108
## Adjusted R2 0.107
## Residual Std. Error 0.364 (df = 75797)
## F Statistic 74.585*** (df = 123; 75797)
## ==========================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
library(coefplot)
library(broom)
# Extract coefficients and standard errors
coef_df <- tidy(lin_reg, conf.int = TRUE)
# Filter out intercept
coef_df <- coef_df[-1,]
num_coef_df <- coef_df[coef_df$term %in% num_attribs,]
cat_coef_df <- coef_df[grep(".*\\_.*", coef_df$term), ]
# Create plots
plot_num <- ggplot(num_coef_df, aes(x = estimate, y = reorder(term, estimate))) +
geom_point(size = 2) +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
labs(x = "Coefficient Estimate", y = "Variable") +
ggtitle("Linear Regression Results for Numeric Variables") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))+
geom_vline(xintercept = 0, linetype = "dashed", color = "red")
plot_num
cat_coef_df1 <- cat_coef_df[1:25,]
cat_coef_df2 <- cat_coef_df[25:50,]
cat_coef_df3 <- cat_coef_df[50:75,]
cat_coef_df4 <- cat_coef_df[75:100,]
cat_coef_df5 <- cat_coef_df[100:125,]
# Create separate plots
plot_cat1 <- ggplot(cat_coef_df1, aes(x = estimate, y = reorder(term, estimate))) +
geom_point(size = 2) +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
labs(x = "Coefficient Estimate", y = "Variable") +
ggtitle("Linear Regression Results for Categorical Variables") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))+
geom_vline(xintercept = 0, linetype = "dashed", color = "red")
plot_cat1
plot_cat2 <- ggplot(cat_coef_df2, aes(x = estimate, y = reorder(term, estimate))) +
geom_point(size = 2) +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
labs(x = "Coefficient Estimate", y = "Variable") +
ggtitle("Linear Regression Results for Categorical Variables") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))+
geom_vline(xintercept = 0, linetype = "dashed", color = "red")
plot_cat2
plot_cat3 <- ggplot(cat_coef_df3, aes(x = estimate, y = reorder(term, estimate))) +
geom_point(size = 2) +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
labs(x = "Coefficient Estimate", y = "Variable") +
ggtitle("Linear Regression Results for Categorical Variables") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))+
geom_vline(xintercept = 0, linetype = "dashed", color = "red")
plot_cat3
plot_cat4 <- ggplot(cat_coef_df4, aes(x = estimate, y = reorder(term, estimate))) +
geom_point(size = 2) +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
labs(x = "Coefficient Estimate", y = "Variable") +
ggtitle("Linear Regression Results for Categorical Variables") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))+
geom_vline(xintercept = 0, linetype = "dashed", color = "red")
plot_cat4
plot_cat5 <- ggplot(cat_coef_df5, aes(x = estimate, y = reorder(term, estimate))) +
geom_point(size = 2) +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
labs(x = "Coefficient Estimate", y = "Variable") +
ggtitle("Linear Regression Results for Categorical Variables") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))+
geom_vline(xintercept = 0, linetype = "dashed", color = "red")
plot_cat5